{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "KyxtNaA0uTN5"
   },
   "source": [
    "## 1. Preparations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "y-_LIRVGuhqC"
   },
   "outputs": [],
   "source": [
    "# to install pytorch on colab\n",
    "from os import path\n",
    "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag\n",
    "platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())\n",
    "\n",
    "accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'\n",
    "\n",
    "!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "acOo9KHRugOi"
   },
   "outputs": [],
   "source": [
    "!pip install -U bcolz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "K79BtvaUu1Bw"
   },
   "outputs": [],
   "source": [
    "!pip install Pillow==4.0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "PG0R8S_auTN5"
   },
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torchvision\n",
    "from torchvision import models,transforms,datasets\n",
    "import bcolz\n",
    "import time\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "zbiLBOHcuTOA"
   },
   "source": [
    "We will first precompute the outputs of Vgg16 model on our dataset and store these values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "Lnsjg6XguTOB"
   },
   "outputs": [],
   "source": [
    "use_gpu = torch.cuda.is_available()\n",
    "print('Using gpu: %s ' % use_gpu)\n",
    "\n",
    "dtype = torch.FloatTensor\n",
    "if use_gpu:\n",
    "    dtype = torch.cuda.FloatTensor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following commands will download the dataset and need to be run only once."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "dY9Pw-JyvW14"
   },
   "outputs": [],
   "source": [
    "%mkdir data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "nqMyL9JnvXX9"
   },
   "outputs": [],
   "source": [
    "%cd data/\n",
    "!wget http://files.fast.ai/data/dogscats.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "WkysvIFcGWaZ"
   },
   "outputs": [],
   "source": [
    "!unzip dogscats.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "6lp57KjUuTN9"
   },
   "outputs": [],
   "source": [
    "def save_array(fname, arr):\n",
    "    c=bcolz.carray(arr, rootdir=fname, mode='w')\n",
    "    c.flush()\n",
    "def load_array(fname):\n",
    "    return bcolz.open(fname)[:]\n",
    "\n",
    "normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n",
    "\n",
    "prep1 = transforms.Compose([\n",
    "                transforms.CenterCrop(224),\n",
    "                transforms.ToTensor(),\n",
    "                normalize,\n",
    "            ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "UnSiC7vPuTOG"
   },
   "outputs": [],
   "source": [
    "#uncomment the appropriate one\n",
    "#laptop data_dir = '/home/lelarge/courses/data/dogscats'\n",
    "#aws data_dir = '/home/ubuntu/data/dogscats'\n",
    "#colab  d\n",
    "data_dir = '/content/data/dogscats'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "sWcoX-7cuTOI"
   },
   "source": [
    "Initialize paths for dataset items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "BzPdfEyPuTOJ"
   },
   "outputs": [],
   "source": [
    "dsets = {x: datasets.ImageFolder(os.path.join(data_dir, x), prep1)\n",
    "         for x in ['train', 'valid']}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "GndzCpm1uTOM"
   },
   "source": [
    "If you are running on CPU, you will probably need to lower the size of the batches. On Colab too :("
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "XAKlRRHpuTON"
   },
   "outputs": [],
   "source": [
    "batch_size = 4\n",
    "#batch_size = 64"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "A3DbqTpzuTOP"
   },
   "source": [
    "Initialize data loader that will fetch images from disk using num_workers parallel threads."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "ziTi2BZUuTOQ"
   },
   "outputs": [],
   "source": [
    "dset_loaders = {x: torch.utils.data.DataLoader(dsets[x], batch_size=batch_size,\n",
    "                                               shuffle=False, num_workers=0)\n",
    "                for x in ['train', 'valid']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "7YghjxOiuTOT"
   },
   "outputs": [],
   "source": [
    "dset_sizes = {x: len(dsets[x]) for x in ['train', 'valid']}\n",
    "dset_sizes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "cx36goi8uTOW"
   },
   "source": [
    "Instantiate VGG16 model pretrained on ImageNet from the ```torchvision``` model Zoo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "S-GEqZ9LuTOZ"
   },
   "outputs": [],
   "source": [
    "model_vgg = models.vgg16(pretrained=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "aZFu8B2auTOb"
   },
   "outputs": [],
   "source": [
    "if use_gpu:\n",
    "    model_vgg = model_vgg.cuda()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "r_5iFrThuTOe"
   },
   "source": [
    "By default all the modules are initialized to train mode (```self.training = True```). Also be aware that some layers have different behavior during train/and evaluation (like _BatchNorm_, _Dropout_) so setting it matters.\n",
    "\n",
    "Also as a rule of thumb for programming in general, try to explicitly state your intent and set ```model.train()``` and ```model.eval()``` when necessary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "VJFtUkwMuTOf"
   },
   "outputs": [],
   "source": [
    "model_vgg.eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "vKglfdHQuTOj"
   },
   "source": [
    "## 2. Feature extraction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "4AlMYUbEuTOk"
   },
   "source": [
    "Function for extracting and storing CNN features, i.e. the ouput of VGG16 model in this case."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "ba_6nYrduTOl"
   },
   "outputs": [],
   "source": [
    "def prefeat(dataset):\n",
    "    features = []\n",
    "    labels_list = []\n",
    "    for data in dataset:\n",
    "        inputs,labels = data\n",
    "        if use_gpu:\n",
    "            inputs , labels = inputs.cuda(),labels.cuda()\n",
    "        else:\n",
    "            inputs , labels = inputs,labels\n",
    "        \n",
    "        x = model_vgg(inputs)\n",
    "        features.extend(x.data.cpu().numpy())\n",
    "        labels_list.extend(labels.data.cpu().numpy())\n",
    "    features = np.concatenate([[feat] for feat in features])\n",
    "    return (features,labels_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "-qh3jyXauTOn"
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "feat_train,lbs_train = prefeat(dset_loaders['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "a-KEGy-puTOq"
   },
   "outputs": [],
   "source": [
    "feat_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "k1D1sOQxuTOt"
   },
   "source": [
    "Loading and resizing the images every time we want to use them isn't necessary - instead we should save the processed arrays. By far the fastest way to save and load numpy arrays is using bcolz. This also compresses the arrays, so we save disk space. Here are the functions we'll use to save and load using bcolz (already loaded above...)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "OD9tNv54M7Xw"
   },
   "outputs": [],
   "source": [
    "%cd /content/data/dogscats/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "qmhsJ050NAw0"
   },
   "outputs": [],
   "source": [
    "%mkdir vgg16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "pNxL8YU-uTOx"
   },
   "outputs": [],
   "source": [
    "save_array(os.path.join(data_dir,'vgg16','feat_train.bc'),feat_train)\n",
    "save_array(os.path.join(data_dir,'vgg16','lbs_train.bc'),lbs_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "wX032qqsuTO0"
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "feat_val,lbs_val = prefeat(dset_loaders['valid'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "2W3Af-6JuTO5"
   },
   "outputs": [],
   "source": [
    "feat_val.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "collapsed": true,
    "id": "vNypRHhfuTO9"
   },
   "outputs": [],
   "source": [
    "save_array(os.path.join(data_dir,'vgg16','feat_val.bc'),feat_val)\n",
    "save_array(os.path.join(data_dir,'vgg16','lbs_val.bc'),lbs_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%cd /content/data/dogscats/\n",
    "!zip -r vgg16 vgg16/*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!pip install -U -q PyDrive\n",
    "\n",
    "from pydrive.auth import GoogleAuth\n",
    "from pydrive.drive import GoogleDrive\n",
    "from google.colab import auth\n",
    "from oauth2client.client import GoogleCredentials\n",
    "\n",
    "# 1. Authenticate and create the PyDrive client.\n",
    "auth.authenticate_user()\n",
    "gauth = GoogleAuth()\n",
    "gauth.credentials = GoogleCredentials.get_application_default()\n",
    "drive = GoogleDrive(gauth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "upload = drive.CreateFile({'title': 'vgg16_drive.zip'})\n",
    "upload.SetContentFile('vgg16.zip')\n",
    "upload.Upload()"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [
    "HMZBihNguTQr"
   ],
   "name": "04_dogscast_colab.ipynb",
   "private_outputs": true,
   "provenance": [],
   "toc_visible": true,
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python [conda env:pytorch]",
   "language": "python",
   "name": "conda-env-pytorch-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
